Introduction

The purpose of this document is to examine codon usage patterns at the genome-wide level and for functional subsets, across four Strongyloides spp. and C. elegans.

For the functional subsets, six groups are analyzed:
1) Top 2% of codon adapted genes in each genome
2) Bottom 2% of codon adapted genes in each genome
3) Top 2% of GC-rich genes in each genome
4) Top 2% of AT-rich genes in each genome

Data Preprocessing

Full lists of CDS sequences for S. stercoralis, S. ratti, S. papillosus, S. venezuelensis and C. elegans were download from Wormbase ParaSite (v15) on November 17, 2020. The .fa files were used as input to the Strongyloides Codon Adapter App. For each species an excel report containing the computed codon adaptation index values relative to S. ratti and C. elegans was downloaded; these file are uploaded the code chunks below.

Analysis/Results

Filter Genome Data

For each species, load data listing GC content and CAI indeces for the full list of CDS elements in genome. The full list contains transcript level information. If downstream tidy version of this data is avliable or the filtered versions are avaliable as files, don’t regenerate them.

if (!file.exists(c("../Data/tidy_genomic_CAI.csv"))) {
  
  
  temp <- c(Ss = '../Data/Ss_Codon_Usage_Report.xlsx',
            Sr = '../Data/Sr_Codon_Usage_Report.xlsx',
            Sp = '../Data/Sp_Codon_Usage_Report.xlsx',
            Sv = '../Data/Sv_Codon_Usage_Report.xlsx',
            Bm = '../Data/Bm_Codon_Usage_Report.xlsx',
            Ce = '../Data/Ce_Codon_Usage_Report.xlsx')
  dat.genomic <- sapply(temp, read_excel, 
                        sheet = 1, 
                        skip = 3, 
                        col_names = T, 
                        simplify = F, 
                        USE.NAMES = T)
  # This data contains transcript level information. 
  # Commented code would trim rows such that in cases where there are 
  # multiple transcripts per gene, only  
  # values from the longest transcript would be included.
  # This was originally included to permit comparison to gene expression databases
  # but is no longer required.
  dat.genomic.cleaned <- lapply(dat.genomic, function(x){
    x %>%
      dplyr::mutate(transcriptID = geneID)
    # dplyr::rename(transcriptID = geneID) %>%
    # dplyr::mutate(geneID = str_remove_all(transcriptID, "\\.[0-9]$")) %>%
    # dplyr::mutate(geneID = str_remove_all(geneID, "[a-c]$")) %>%
    # dplyr::group_by(geneID) %>%
    # dplyr::slice_max(n = 1, order_by = str_length(`cDNA sequence`), 
    #                   with_ties = FALSE) %>%
    #dplyr::relocate(geneID, .before = everything())
    
  })
  names(dat.genomic.cleaned) <- c("Ss", "Sr", "Sp", "Sv", "Bm", "Ce")
  
}

Tidy Genome Data

Tidy the genomic CAI values for downstream plotting and analysis. This can be computationally intensive, so if the code is run, a tidy version will get saved at the end of the Markdown file as a .csv file. If that file exists, don’t re-run this code.

if (!file.exists(c("../Data/tidy_genomic_CAI.csv"))) {
  dat.genomic.df <- dat.genomic.cleaned %>%
    map("geneID") %>%
    unlist() %>%
    as_tibble_col(column_name = "geneID") %>%
    group_by(geneID)
  
  dat.genomic.df <- dat.genomic.cleaned %>%
    map("transcriptID") %>%
    unlist() %>%
    as_tibble_col(column_name = "transcriptID") %>%
    add_column(dat.genomic.df, .)
  
  dat.genomic.df <- dat.genomic.cleaned %>%
    map("Sr_CAI") %>%
    unlist() %>%
    as_tibble_col(column_name = "Sr_CAI") %>%
    add_column(dat.genomic.df, .)
  
   dat.genomic.df <- dat.genomic.cleaned %>%
    map("Bm_CAI") %>%
    unlist() %>%
    as_tibble_col(column_name = "Bm_CAI") %>%
    add_column(dat.genomic.df, .)
  
  dat.genomic.df <- dat.genomic.cleaned %>%
    map("Ce_CAI") %>%
    unlist() %>%
    as_tibble_col(column_name = "Ce_CAI") %>%
    add_column(dat.genomic.df, .)
  
  dat.genomic.df <- dat.genomic.cleaned %>%
    map("GC") %>%
    unlist() %>%
    as_tibble_col(column_name = "GC") %>%
    add_column(dat.genomic.df, .)
  
  # Add the column that species which species the gene is from
  # by matching to the original lists of genes, 
  # the group
  dat.genomic.df <- dat.genomic.df %>%
    dplyr::mutate(species = case_when(
      transcriptID %in% dat.genomic$Ss$geneID ~ 'Ss',
      transcriptID %in% dat.genomic$Sr$geneID ~ 'Sr',
      transcriptID %in% dat.genomic$Sp$geneID ~ 'Sp',
      transcriptID %in% dat.genomic$Sv$geneID ~ 'Sv',
      transcriptID %in% dat.genomic$Bm$geneID ~ 'Bm',
      transcriptID %in% dat.genomic$Ce$geneID ~ 'Ce',
    ))
  dat.genomic.df <- group_by(dat.genomic.df, species)
  
}

# Save a tidy version of the complied genomic CAI value dataset if it doesn't exist
if (!file.exists(c("../Data/tidy_genomic_CAI.csv"))) {
  write.table(dat.genomic.df, 
              file = "../Data/tidy_genomic_CAI.csv", 
              sep = ",", 
              col.names = T, 
              row.names = FALSE)
  
}

CAI: Genome-wide Distribution Plot

This section takes the calculated codon adaptation index values for all predicted coding sequences, and generates a density curve for each species. Ideally, this plot could be used to benchmark the calculated CAI values against other metrics of codon bias in these species. For example, Mitreva et al 2006 reported effective number of codons (ENC) as a measure of gene-wise codon bias, across species. They report that “mean ENC across all sampled nemtaode species is 46.7 +/- 5.1” but that S. stercoralis and S. ratti are outliers with low ENC values (~35). Note: ENC values range from 20 for highly biased to 61 for no bias. Thus, benchmarking from Mitreva et al 2006, we might expect that Strongyloides species will have greater codon bias than C. elegans. Unfortunately, although they do report measuring distribution of ENC values across all transcripts for each species, they only have plots for S. ratti, P. trichosuri and P. pacificus.

dat.genomic.df <- suppressWarnings(
  read.csv("../Data/tidy_genomic_CAI.csv", 
           header = TRUE)) %>%
  as_tibble() 

species.specific.dat.df <- dat.genomic.df %>%
  dplyr::mutate(species_CAI = case_when(
    species == 'Ce' ~ Ce_CAI,
    species == 'Bm' ~ Bm_CAI,
    TRUE ~ Sr_CAI
  )) %>%
  dplyr::mutate(outgroup_CAI = case_when(
    species == 'Ce' ~ Sr_CAI,
    species != 'Ce' ~ Ce_CAI
  )) %>%
  #dplyr:: select(geneID, species_CAI, species) %>%
  group_by(species)


plot_distributions <- function(dat,
                               x,
                               title,
                               xlabel,
                               caption = "",
                               ylabel = "Density (scaled to maximum of 1)",
                               xlim = c(0,1),
                               ylim = c(0,1)) {
  dat %>%
    ggplot(aes(x = x, y = after_stat(ndensity), color = species)) +
    geom_freqpoly(bins = 50) +
    scale_color_manual(values = c("seagreen4", "grey4","steelblue4", 
                                  "coral4", "darkgoldenrod4", 
                                  "darkorchid4"),
                       name = "Species",
                       labels = c("B. malayi","C. elegans", "S. papillosus", 
                                  "S. ratti", "S. stercoralis",
                                  "S. venezuelensis")) +
    labs(title = title,
         subtitle = "Data includes all predicted coding sequences",
         x = xlabel,
         y = ylabel,
         caption = caption) +
    coord_equal(xlim = xlim, ylim = ylim) +
    theme_bw() +
    theme(plot.title.position = "plot",
          plot.caption.position = "panel",
          plot.title = element_text(face = "bold",
                                    size = 10, hjust = 0),
          plot.subtitle = element_text(size = 8),
          legend.title = element_text(size = 8),
          axis.title = element_text(size = 8),
          axis.text = element_text(size = 8),
          axis.text.x = element_text(angle = 45, hjust = 1))
}
species_adaptiveness_plot <- plot_distributions(species.specific.dat.df,
                                                species.specific.dat.df$species_CAI,
                                                "Species-wide codon adaptiveness",
                                                "Codon bias relative to \n genus coding usage rules (CAI)",
                                                "Strongyloides species values relative to S. ratti usage rules
       C. elegans values relative to C. elegans usage rules.
       B. malayi values relative to B. malayi ussage rules.")

species_adaptiveness_plot

Notes re: the above plot - C. elegans shows overall lower codon bias compared to the Strongyloides species, as predicted from the Mitreva et al 2006 results. Reminder that Mitreva et al reported that this difference is greatly affected by the GC content; more AT rich speices like the Strongyloides species have greater codon usage biases. Also, it looks like the Strongyloides subclades (S. ratti - S. stercoralis; S. papillosus - S. venezuelensis) cluster together.

Kruskal-Wallis Test with Dunn’s Post tests for differences in genomic CAI values across species

# Kruskal-Wallis testing of genomic CAI
one.way <- kruskal.test(species_CAI ~ species, data = species.specific.dat.df)

# Posthoc Dunn tests with Bonferroni adjustment
post.hoc <- dunn.test::dunn.test(species.specific.dat.df$species_CAI, species.specific.dat.df$species, method = "bonferroni")
##   Kruskal-Wallis rank sum test
## 
## data: x and group
## Kruskal-Wallis chi-squared = 67968.8801, df = 5, p-value = 0
## 
## 
##                            Comparison of x by group                            
##                                  (Bonferroni)                                  
## Col Mean-|
## Row Mean |         Bm         Ce         Sp         Sr         Ss
## ---------+-------------------------------------------------------
##       Ce |   239.1902
##          |    0.0000*
##          |
##       Sp |   140.9085  -90.93966
##          |    0.0000*    0.0000*
##          |
##       Sr |   59.54179  -156.7374  -71.05939
##          |    0.0000*    0.0000*    0.0000*
##          |
##       Ss |   75.88232  -141.8075  -55.85025   14.81922
##          |    0.0000*    0.0000*    0.0000*    0.0000*
##          |
##       Sv |   137.4337  -89.24383  -0.668115   69.17591   54.20444
##          |    0.0000*    0.0000*     1.0000    0.0000*    0.0000*
## 
## alpha = 0.05
## Reject Ho if p <= alpha/2

GC: Genome-wide Distribution Plot

This section takes the calculated GC content values for all predicted coding sequences, and generates a density curve for each species.

species.specific.GC.df <- dat.genomic.df %>%
  dplyr:: select(geneID, GC, species) %>%
  group_by(species)



species_GC_plot <- plot_distributions(species.specific.GC.df,
                                      species.specific.GC.df$GC,
                                      "Species-wide GC content",
                                      "GC content")

species_GC_plot

Kruskal-Wallis Test with Dunn’s Post tests for differences in genomic GC values across species

# Kruskal-Wallis testing of genomic GC
one.way <- kruskal.test(GC ~ species, data = species.specific.GC.df)

# Posthoc Dunn tests with Bonferroni adjustment
post.hoc <- dunn.test::dunn.test(species.specific.GC.df$GC, species.specific.GC.df$species, method = "bonferroni")
##   Kruskal-Wallis rank sum test
## 
## data: x and group
## Kruskal-Wallis chi-squared = 67947.0291, df = 5, p-value = 0
## 
## 
##                            Comparison of x by group                            
##                                  (Bonferroni)                                  
## Col Mean-|
## Row Mean |         Bm         Ce         Sp         Sr         Ss
## ---------+-------------------------------------------------------
##       Ce |  -47.12620
##          |    0.0000*
##          |
##       Sp |   92.70032   157.3087
##          |    0.0000*    0.0000*
##          |
##       Sr |   130.6162   190.8436   48.50346
##          |    0.0000*    0.0000*    0.0000*
##          |
##       Ss |   125.6135   186.4502   42.18947  -6.400016
##          |    0.0000*    0.0000*    0.0000*    0.0000*
##          |
##       Sv |   98.32093   161.7153   7.855613  -40.54231  -34.22333
##          |    0.0000*    0.0000*    0.0000*    0.0000*    0.0000*
## 
## alpha = 0.05
## Reject Ho if p <= alpha/2

Analyze Codon Adaptation Index (CAI) values displayed by gene sets of interest

Pull Select Percentiles of Codon Adapted Genes

Take genome-wide CAI values, and for each species filter the top 2% and also bottom 2% of codon adapted genes. Will run GO analysis on these to determine whether there are enriched GO Terms.

percentile = 0.02

# For following analyses, only look at Strongyloides and C. elegans, not B. malayi
species.specific.dat.df<-species.specific.dat.df %>%
  dplyr::filter(species != "Bm")

dat.Top <- species.specific.dat.df %>%
  #dplyr::filter(species != "Ce") %>%
  dplyr::mutate(Description = factor("Top")) %>%
  dplyr::group_by(Description, species) %>%
  dplyr::slice_max(order_by = species_CAI, prop = percentile) %>%
  dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce")))

tally(dat.Top)
dat.Bot <- species.specific.dat.df %>%
  # dplyr::filter(species != "Ce") %>%
  dplyr::mutate(Description = factor("Bot")) %>%
  dplyr::group_by(Description, species) %>%
  dplyr::slice_max(order_by = desc(species_CAI), prop = percentile) %>%
  dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce")))

tally(dat.Bot)

Plot Relative CAI vs GC content for Gene Sets

This section/plot asks questions like: Are the most-codon-adapted Strongyloides genes less codon adapted relative to the C. elegans usage rules? Similarly, are the least-codon-adapted genes less codon adapted relative to the other usage rule. Also, is there a consistent relationship between the degree of codon adaptation and GC content (a common confounding factor).

Genus CAI vs GC Plot

tbl <- rbind(dat.Top, dat.Bot) %>%
  mutate(Description = recode(Description, Top = "Most-adapted genes",
                              Bot = "Least-adapted genes"))

ggplot(tbl, aes(species_CAI,GC, Description)) +
  geom_point(data = species.specific.dat.df,
             mapping = aes(x = species_CAI, y = GC),color = "grey", shape = 1) +
  geom_point(mapping = aes(color = species, shape = Description)) +
  #geom_jitter(mapping = aes(color = species), shape = 1) +
  coord_fixed (3) +
  facet_grid( ~species) +
  scale_color_hue (l = 40) +
   scale_shape_manual(values = c(1:2)) +
  labs(x = "Codon bias relative to genus usage rules",
       y = "G+C ratio",
       title = "G+C ratio as a function of codon adaptiveness (genus rules)",
       subtitle = "grey icons = all genes \n colored icons = most and least adapted genes ") +
  theme_bw() +
  theme(plot.title.position = "plot",
        plot.caption.position = "panel",
        plot.title = element_text(face = "bold",
                                  size = 8, hjust = 0),
        plot.subtitle = element_text(size = 8),
        legend.title = element_text(size = 8),
        axis.title = element_text(size = 8),
        axis.text = element_text(size = 8),
        axis.text.x = element_text(angle = 45, hjust = 1))

Non-Genus CAI Plot

# tbl <- pivot_longer(tbl,
#              cols = c("species_CAI", "outgroup_CAI"),
#              names_to = "value",
#              values_to = "CAI"
#              )
ggplot(tbl, aes(outgroup_CAI,GC, Description)) +
  geom_point(mapping = aes(color = species, shape = Description)) +
  #geom_jitter(mapping = aes(color = species), shape = 1) +
  coord_equal()+
  facet_wrap( ~species) +
  scale_color_hue (l = 40) +
  scale_shape_manual(values = c(1:2)) +
  labs(x = "Codon bias relative to non-genus usage rules",
       y = "G+C ratio",
       title = "G+C ratio as a function of codon adaptiveness (non-genus rules)") +
  theme_bw() +
  theme(plot.title.position = "plot",
        plot.caption.position = "panel",
        plot.title = element_text(face = "bold",
                                  size = 8, hjust = 0),
        plot.subtitle = element_text(size = 8),
        legend.title = element_text(size = 8),
        axis.title = element_text(size = 8),
        axis.text = element_text(size = 8))

RNA-seq Expression of Top/Bottom % Codon Adapted Genes

Do genes with the top and bottom 2% of CAI values show particularly high or low expression levels? We might expect that genes with the top 2% of CAI values, where the CAI is based on highly expressed S. ratti ESTs might tend to show higher expression levels.

temp <- c(Ss = '../Data/SsRNAseq_log2cpm_filtered_norm_voom.csv',
          Sr = '../Data/SrRNAseq_log2cpm_filtered_norm_voom.csv',
          Sp = '../Data/SpRNAseq_log2cpm_filtered_norm_voom.csv',
          Sv = '../Data/SvRNAseq_log2cpm_filtered_norm_voom.csv')
dat.expression <- sapply(temp, read.csv, 
                      header = TRUE)
species_names <- names(dat.expression)

dat.Top.expression <- lapply(species_names, function(x) {
    dat.Top.select <- dplyr::filter(dat.Top, species == x) %>%
        dplyr::mutate(geneID = str_remove_all(geneID, "\\.[0-9]$")) 
    
    subset.expression <- dat.expression[[x]] %>%
        dplyr::filter(X %in% dat.Top.select$geneID) %>%
        dplyr::rename(geneID = X) %>%
        pivot_longer(cols = -geneID,
                     names_to = "samples",
                     values_to = "expression") %>%
    tidyr::separate(samples,c("stage", NA)) %>%
    dplyr::group_by(geneID, stage) %>%
    dplyr::summarize(expression = median(expression))
})
names(dat.Top.expression) <- species_names
dat.Top.expression <- map_dfr(dat.Top.expression, bind_rows, .id = "species")


dat.Bot.expression <- lapply(species_names, function(x) {
    dat.Bot.select <- dplyr::filter(dat.Bot, species == x) %>%
        dplyr::mutate(geneID = str_remove_all(geneID, "\\.[0-9]$")) 
    
    subset.expression <- dat.expression[[x]] %>%
        dplyr::filter(X %in% dat.Bot.select$geneID) %>%
        dplyr::rename(geneID = X) %>%
        pivot_longer(cols = -geneID,
                     names_to = "samples",
                     values_to = "expression") %>%
    tidyr::separate(samples,c("stage", NA)) %>%
    dplyr::group_by(geneID, stage) %>%
    dplyr::summarize(expression = median(expression))
})
names(dat.Bot.expression) <- species_names
dat.Bot.expression <- map_dfr(dat.Bot.expression, bind_rows, .id = "species")

dat.expression <- lapply(species_names, function(x) {
    dat.expression[[x]] %>%
        dplyr::rename(geneID = X) %>%
        pivot_longer(cols = -geneID,
                     names_to = "samples",
                     values_to = "expression") %>%
    tidyr::separate(samples,c("stage", NA)) %>%
    dplyr::group_by(geneID, stage) %>%
    dplyr::summarize(expression = median(expression))
})

names(dat.expression) <- species_names
dat.expression <- map_dfr(dat.expression, bind_rows, .id = "species")

Comparing expression in 2% highest and lowest CAI

lapply(species_names, function(x){
  plotdata.subset1 <- filter(dat.Top.expression, species == x)
  plotdata.subset2 <- filter(dat.Bot.expression, species == x)
  plotdata.all <- filter(dat.expression, species == x)
  
  plot.df <- bind_rows(HighCAI = plotdata.subset1,
                       LowCAI = plotdata.subset2,
                       AllGenes = plotdata.all, .id = "Description")
 
    p.Bot.expression <- ggplot(plot.df) +
        aes(x=stage, y=expression, color = Description, fill = Description) +
      geom_violin(show.legend = T, 
                    alpha = .5,
                   position =  position_dodge(width = 1),
                    trim = FALSE) +
      scale_color_manual(values = c("black", "coral4", "steelblue4")) +
      scale_fill_manual(values = c("grey", "coral4", "steelblue4")) +
        stat_summary(fun = "median", 
                     geom = "point", 
                     shape = 20, 
                     size = 2,  
                     position =  position_dodge(width = 1),
                     show.legend = FALSE) +
      facet_grid(~stage, scales = "free") +
        labs(y="log2CPM Expression", x = "Life Stage",
             title = paste0(x, " - log2 Counts per Million (CPM)"),
             caption=paste0("produced on ", Sys.time())) +
        theme_bw() 
    
    suppressMessages(ggsave(paste0(x,"_CAIExpression.pdf"),
                            plot = p.Bot.expression,
                            device = "pdf",
                            height = 4,
                            width = 7,
                            path = output.path,
                            useDingbats=FALSE))
    
    p.Bot.expression
})
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

CAI as a function of expression in free-living females

Does CAI value correspond with expression in free-living females

temp <- lapply(species_names, function(x) {
    dat.select <- dplyr::filter(species.specific.dat.df, 
                                    species == x) %>%
        dplyr::mutate(geneID = str_remove_all(geneID, "\\.[0-9]$")) %>%
      dplyr::ungroup() %>%
      dplyr::select(geneID, species_CAI)
    
    subset.expression <- dat.expression %>%
      dplyr::filter(species == x)%>%
      dplyr::filter(stage == "FLF") %>%
      dplyr::filter(geneID %in% dat.select$geneID)
    
    full_join(dat.select, subset.expression, by = "geneID") %>%
      dplyr::select(!stage)
})
names(temp) <- species_names

caiExp.df <- map_dfr(temp, bind_rows, .id = "species")

caiExp.df <- bind_rows(AllGenes = caiExp.df,
                       HighCAI =  dplyr::filter(caiExp.df, geneID %in% dat.Top.expression$geneID),
                       LowCAI =  dplyr::filter(caiExp.df, geneID %in% dat.Bot.expression$geneID),
                       .id = "Description") %>%
  dplyr::mutate(species = as_factor(species))

caiExp.highExp_plot <- ggplot(caiExp.df) +
   aes(x=Description, y=expression, color = Description, fill = Description) +
      geom_violin(show.legend = T, 
                   position =  position_dodge(width = 1),
                    trim = FALSE,
                  color = 'black') +
  
      #scale_color_manual(values = c("black", "coral4", "steelblue4")) +
      scale_fill_manual(values = c("grey", "coral4", "steelblue4")) +
        stat_summary(fun = "median", 
                     geom = "point", 
                     shape = 20, 
                     size = 2,  
                     position =  position_dodge(width = 1),
                     color = "black",
                     show.legend = FALSE) +
  facet_grid( ~species) +
  labs(y="log2CPM Expression", x = "Gene subsets (base on CAI value) by species",
             title =  "log2 Counts per Million (CPM) in free-living female as a function of codon bias",
       subtitle = "grey = all genes \ncolors = top/bottom 2% CAI",
             caption=paste0("produced on ", Sys.time())) +
        theme_bw() +
  theme(plot.title.position = "plot",
        plot.caption.position = "panel",
        plot.title = element_text(face = "bold",
                                  size = 8, hjust = 0),
        plot.subtitle = element_text(size = 8),
        legend.title = element_text(size = 8),
        axis.title = element_text(size = 8),
        axis.text = element_text(size = 8),
        axis.text.x = element_text(angle = 45, hjust = 1))

caiExp.highExp_plot

2-way ANOVA of expression

suppressPackageStartupMessages({library(car)})
cpm.merged <- rbind(dat.expression %>% dplyr::mutate(condition = "AllGenes"),
                    dat.Top.expression %>% dplyr::mutate(condition = "HighCPM"),
                    dat.Bot.expression %>% dplyr::mutate(condition = "LowCPM")) %>%
    dplyr::mutate(condition = as_factor(condition))%>%
  group_by(species, geneID, condition, stage)

options(contrasts = c("contr.sum", "contr.poly"))

lapply(species_names, function(x) {
cpm.merged.sub <- filter(cpm.merged, species == x)

res.aov2<- aov(expression ~ condition * stage, data = cpm.merged.sub)
Anova(res.aov2, type = "III")
res.aov3<-TukeyHSD(res.aov2, "condition:stage")
as_tibble(res.aov3$`condition:stage`, rownames = "comparison") %>%
  separate(col = comparison, into = c("comp1", "LS1", "comp2", "LS2")) %>%
  dplyr::filter(LS1 == LS2) %>%
  unite(temp1, comp1, LS1, sep = ":")%>%
  unite(temp2, comp2, LS2, sep = ":")%>%
  unite(comparison, temp1, temp2, sep = "-")

})
## [[1]]
## # A tibble: 21 x 5
##    comparison                   diff    lwr    upr  `p adj`
##    <chr>                       <dbl>  <dbl>  <dbl>    <dbl>
##  1 HighCPM:FLF-AllGenes:FLF    1.79   0.960  2.63  3.50e-12
##  2 LowCPM:FLF-AllGenes:FLF    -1.45  -2.41  -0.497 1.20e- 5
##  3 LowCPM:FLF-HighCPM:FLF     -3.25  -4.50  -1.99  2.38e-13
##  4 HighCPM:iL3-AllGenes:iL3    0.665 -0.168  1.50  3.43e- 1
##  5 LowCPM:iL3-AllGenes:iL3    -2.05  -3.00  -1.09  4.68e-12
##  6 LowCPM:iL3-HighCPM:iL3     -2.71  -3.97  -1.45  3.04e-12
##  7 HighCPM:iL3a-AllGenes:iL3a  1.10   0.266  1.93  4.83e- 4
##  8 LowCPM:iL3a-AllGenes:iL3a  -1.76  -2.71  -0.801 1.14e- 8
##  9 LowCPM:iL3a-HighCPM:iL3a   -2.86  -4.11  -1.60  3.64e-13
## 10 HighCPM:PF-AllGenes:PF      0.943  0.109  1.78  9.21e- 3
## # … with 11 more rows
## 
## [[2]]
## # A tibble: 12 x 5
##    comparison                 diff    lwr    upr  `p adj`
##    <chr>                     <dbl>  <dbl>  <dbl>    <dbl>
##  1 HighCPM:FLF-AllGenes:FLF  1.57   0.785  2.35  3.69e- 9
##  2 LowCPM:FLF-AllGenes:FLF  -4.00  -4.93  -3.08  0.      
##  3 LowCPM:FLF-HighCPM:FLF   -5.57  -6.77  -4.37  0.      
##  4 HighCPM:FLM-AllGenes:FLM  3.22   2.44   4.00  0.      
##  5 LowCPM:FLM-AllGenes:FLM  -2.72  -3.65  -1.80  8.69e-14
##  6 LowCPM:FLM-HighCPM:FLM   -5.94  -7.14  -4.74  0.      
##  7 HighCPM:iL3-AllGenes:iL3  0.607 -0.174  1.39  3.15e- 1
##  8 LowCPM:iL3-AllGenes:iL3  -2.77  -3.70  -1.85  1.23e-13
##  9 LowCPM:iL3-HighCPM:iL3   -3.38  -4.58  -2.18  1.36e-13
## 10 HighCPM:PF-AllGenes:PF    0.175 -0.605  0.956 1.00e+ 0
## 11 LowCPM:PF-AllGenes:PF    -3.93  -4.85  -3.01  0.      
## 12 LowCPM:PF-HighCPM:PF     -4.11  -5.31  -2.91  2.93e-14
## 
## [[3]]
## # A tibble: 18 x 5
##    comparison                       diff    lwr     upr  `p adj`
##    <chr>                           <dbl>  <dbl>   <dbl>    <dbl>
##  1 HighCPM:FLF-AllGenes:FLF        2.60   1.86   3.33   0.      
##  2 LowCPM:FLF-AllGenes:FLF        -0.806 -1.66   0.0522 9.58e- 2
##  3 LowCPM:FLF-HighCPM:FLF         -3.40  -4.52  -2.28   1.69e-13
##  4 HighCPM:flL1L2-AllGenes:flL1L2  3.67   2.94   4.41   0.      
##  5 LowCPM:flL1L2-AllGenes:flL1L2  -1.76  -2.62  -0.901  1.31e-10
##  6 LowCPM:flL1L2-HighCPM:flL1L2   -5.43  -6.55  -4.31   0.      
##  7 HighCPM:FLM-AllGenes:FLM        2.60   1.86   3.33   0.      
##  8 LowCPM:FLM-AllGenes:FLM        -1.50  -2.36  -0.642  1.60e- 7
##  9 LowCPM:FLM-HighCPM:FLM         -4.10  -5.22  -2.98   0.      
## 10 HighCPM:iL3-AllGenes:iL3        1.04   0.307  1.78   1.11e- 4
## 11 LowCPM:iL3-AllGenes:iL3        -1.53  -2.39  -0.669  8.02e- 8
## 12 LowCPM:iL3-HighCPM:iL3         -2.57  -3.69  -1.45   4.28e-13
## 13 HighCPM:PF-AllGenes:PF          1.92   1.18   2.65   2.29e-13
## 14 LowCPM:PF-AllGenes:PF          -0.938 -1.80  -0.0804 1.61e- 2
## 15 LowCPM:PF-HighCPM:PF           -2.85  -3.97  -1.73   1.29e-13
## 16 HighCPM:pL1L2-AllGenes:pL1L2    3.65   2.91   4.38   0.      
## 17 LowCPM:pL1L2-AllGenes:pL1L2    -1.75  -2.61  -0.897  1.48e-10
## 18 LowCPM:pL1L2-HighCPM:pL1L2     -5.40  -6.52  -4.28   0.      
## 
## [[4]]
## # A tibble: 6 x 5
##   comparison                 diff     lwr    upr  `p adj`
##   <chr>                     <dbl>   <dbl>  <dbl>    <dbl>
## 1 HighCPM:FLF-AllGenes:FLF  1.12   0.544   1.70  5.00e- 7
## 2 LowCPM:FLF-AllGenes:FLF  -1.55  -2.32   -0.789 1.05e- 7
## 3 LowCPM:FLF-HighCPM:FLF   -2.68  -3.63   -1.72  9.73e-14
## 4 HighCPM:PF-AllGenes:PF    0.493 -0.0869  1.07  1.48e- 1
## 5 LowCPM:PF-AllGenes:PF    -1.61  -2.37   -0.846 2.91e- 8
## 6 LowCPM:PF-HighCPM:PF     -2.10  -3.06   -1.15  4.66e- 9

GO Analysis: Top % Codon Adapted Genes

Take lists of the top percentile of codon adapted genes in each species, and run GO analysis to see if these genes are enriched for anything interesting.

# Carry out GO enrichment using gProfiler2 ----

run_GO <- function (data){
  GO.geneID <- data %>%
    ungroup() %>%
    group_by(species) %>%
    dplyr::group_map(~ {
      gost.res <- .x %>%
        dplyr::select(geneID, GC) %>%
        unique() 
    }, .keep = TRUE)
  
  names(GO.geneID) <- levels(data$species)
  
  gost.results <- lapply(names(GO.geneID), function(y){
    organism = case_when (y == "Ss" ~ "ststerprjeb528",
                          y == "Sr" ~ "strattprjeb125",
                          y == "Sp" ~ "stpapiprjeb525",
                          y == "Sv" ~ "stveneprjeb530",
                          y == "Ce" ~ "caelegprjna13758")
    gost.res <- gost(GO.geneID[[y]]$geneID,
                     organism = organism, 
                     evcodes = TRUE,
                     correction_method = "fdr")
  })
  names(gost.results) <- names(GO.geneID)
  gost.results
}

find_enrichments <- function (gost.results) {
  # Find GO terms that are enriched with p-values equal or small than a threshold.
  enriched.terms <- lapply(names(gost.results), function(y){
    gost.results[[y]]$result %>%
      as_tibble() %>%
      dplyr::select(term_id, p_value)
  }) 
  names(enriched.terms) <- names(gost.results)
  enriched.terms
}

find_common_enrichments <- function (data, p.val,n_groups){
  data %>%
    dplyr::filter(p_value <= p.val) %>%
    group_by(term_id) %>%
    summarize(n= n()) %>%
    dplyr::filter(n >= n_groups)
}

gost.results.top <- run_GO(dat.Top)
enriched.terms.top <- find_enrichments(gost.results.top)
highTerms.overlap.top <- enriched.terms.top %>%
  bind_rows(.id = "species") %>%
  dplyr::filter(species != "Ce") %>%
  find_common_enrichments(.,0.001, 4)

Interactive Manhattan Plots

S. stercoralis

gostplot(gost.results.top$Ss, interactive = T, capped = T) 

S. ratti

gostplot(gost.results.top$Sr, interactive = T, capped = T) 

S. papillosus

gostplot(gost.results.top$Sp, interactive = T, capped = T) 

S. venezuelensis

gostplot(gost.results.top$Sv, interactive = T, capped = T) 

C. elegans

gostplot(gost.results.top$Ce, interactive = T, capped = T) 

Common GO Terms

GO terms that are commonly enriched in the top percentile of codon adapted genes in all Strongyloides species tested:

dplyr::filter(gost.results.top$Ss$result, term_id %in% highTerms.overlap.top$term_id) %>%
  dplyr::select(term_id, term_name, intersection)

How many of the common Strongyloides go terms are found in the C. elegans terms?

dplyr::filter(gost.results.top$Ce$result, term_id %in% highTerms.overlap.top$term_id) %>%
  dplyr::select(term_id, term_name)

How many are unique to the Strongyloides species?

temp <- dplyr::filter(highTerms.overlap.top, 
                      !term_id %in% gost.results.top$Ce$result$term_id)
dplyr::filter(gost.results.top$Ss$result, term_id %in% temp$term_id) %>%
  dplyr::select(term_id, term_name)

GO Analysis: Bottom % Codon Adapted Genes

Take lists of the bottom percentile of codon adapted genes in each species, and run GO analysis to see if these genes are enriched for anything interesting.

# Carry out GO enrichment using gProfiler2 ----
gost.results.bot <- run_GO(dat.Bot)
enriched.terms.bot <- find_enrichments(gost.results.bot)
highTerms.overlap.bot <- enriched.terms.bot %>%
  bind_rows(.id = "species") %>%
  dplyr::filter(species != "Ce") %>%
  find_common_enrichments(.,0.001, 4)

Interactive Manhattan Plots

S. stercoralis

gostplot(gost.results.bot$Ss, interactive = T, capped = T) 

S. ratti

gostplot(gost.results.bot$Sr, interactive = T, capped = T) 

S. papillosus

gostplot(gost.results.bot$Sp, interactive = T, capped = T) 

S. venezuelensis

gostplot(gost.results.bot$Sv, interactive = T, capped = T) 

C. elegans

gostplot(gost.results.bot$Ce, interactive = T, capped = T) 

Common GO Terms

GO terms that are commonly enriched in the bottom percentile of codon adapted genes in all Strongyloides species tested:

dplyr::filter(gost.results.bot$Ss$result, term_id %in% highTerms.overlap.bot$term_id) %>%
  dplyr::select(term_id, term_name)

How many of the common Strongyloides go terms are found in the C. elegans terms?

dplyr::filter(gost.results.bot$Ce$result, term_id %in% highTerms.overlap.bot$term_id) %>%
  dplyr::select(term_id, term_name)

How many are unique to the Strongyloides species?

temp <- dplyr::filter(highTerms.overlap.bot, 
                      !term_id %in% gost.results.bot$Ce$result$term_id)
dplyr::filter(gost.results.bot$Ss$result, term_id %in% temp$term_id) %>%
  dplyr::select(term_id, term_name)

GO Analysis: AT-rich Strongyloides sequences

dat.ATrich <- species.specific.dat.df %>%
  #dplyr::filter(species != "Ce") %>%
  dplyr::mutate(Description = factor("Top")) %>%
  dplyr::group_by(Description, species) %>%
  dplyr::slice_max(order_by = desc(GC), prop = percentile) %>%
  dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce")))

How many genes in the AT-rich subset are also included in the subset of genes highly adapted for Strongyloides expression?

common.sequences <- inner_join(dat.ATrich, dat.Top) %>%
  tally()

tally(dat.Top) %>%
  dplyr::filter(species != "Ce") %>%
  dplyr::mutate(ratio_shared_seq = common.sequences$n/n)
# Go analysis
gost.results.ATrich <- run_GO(dat.ATrich)
enriched.terms.ATrich <- find_enrichments(gost.results.ATrich)
highTerms.overlap.ATrich <- enriched.terms.ATrich %>%
  bind_rows(.id = "species") %>%
  dplyr::filter(species != "Ce") %>%
  find_common_enrichments(.,0.001, 4)

What GO terms are enriched in AT-rich genes?

dplyr::filter(gost.results.ATrich$Ss$result, term_id %in% highTerms.overlap.ATrich$term_id) %>%
  dplyr::select(term_id, term_name, intersection)

How many GO terms are enriched in both AT-rich sequences and most Str-codon-adapted sequences?

#
temp <- dplyr::filter(highTerms.overlap.top, 
                      term_id %in% highTerms.overlap.ATrich$term_id)
dplyr::filter(gost.results.ATrich$Ss$result, term_id %in% temp$term_id) %>%
  dplyr::select(term_id, term_name)

How many GO terms are enriched in both AT-rich sequences and least Str-codon-adapted sequences?

temp <- dplyr::filter(highTerms.overlap.bot, 
                      term_id %in% highTerms.overlap.ATrich$term_id)
dplyr::filter(gost.results.ATrich$Ss$result, term_id %in% temp$term_id) %>%
  dplyr::select(term_id, term_name)

GO Analysis: GC-rich Strongyloides sequences

dat.GCrich <- species.specific.dat.df %>%
  #dplyr::filter(species != "Ce") %>%
  dplyr::mutate(Description = factor("Bot")) %>%
  dplyr::group_by(Description, species) %>%
  dplyr::slice_max(order_by = GC, prop = percentile) %>%
  dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce")))

How many genes in the GC-rich subset are also included in the subset of genes poorly adapted for Strongyloides expression?

common.sequences <- inner_join(dat.GCrich, dat.Bot) %>%
  dplyr::filter(species != "Ce") %>%
  tally()

tally(dat.Bot) %>%
  dplyr::filter(species != "Ce") %>%
  dplyr::mutate(ratio_shared_seq = common.sequences$n/n)
# Go analysis
gost.results.GCrich <- run_GO(dat.GCrich)
enriched.terms.GCrich <- find_enrichments(gost.results.GCrich)
highTerms.overlap.GCrich <- enriched.terms.GCrich %>%
  bind_rows(.id = "species") %>%
  dplyr::filter(species != "Ce") %>%
  find_common_enrichments(.,0.001, 4)

What GO terms are enriched in GC-rich genes?

dplyr::filter(gost.results.GCrich$Ss$result, term_id %in% highTerms.overlap.GCrich$term_id) %>%
  dplyr::select(term_id, term_name, intersection)

How many GO terms are enriched in both GC-rich sequences and least Str-codon-adapted sequences?

#Overlap btwn GO terms enriched in GC-rich sequences and least Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.bot, 
                      term_id %in% highTerms.overlap.GCrich$term_id)
dplyr::filter(gost.results.GCrich$Ss$result, term_id %in% temp$term_id) %>%
  dplyr::select(term_id, term_name)

How many GO terms are enriched in both GC-rich sequences and highest Str-codon-adapted sequences?

#Overlap btwn GO terms enriched in GC-rich sequences and highest Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.top, 
                      term_id %in% highTerms.overlap.GCrich$term_id)
dplyr::filter(gost.results.GCrich$Ss$result, term_id %in% temp$term_id) %>%
  dplyr::select(term_id, term_name)

GO Analysis: Top % Gene Expression

Generate lists of genes with the highest expression in free-living females in each species, and run GO analysis to see if these genes are enriched for anything interesting. Here, common GO terms are defined as terms that appear in 3 or 4 species, because the results from S. papillosus are divergent from the others.

gost.results.highexp <- lapply(species_names, function(x) {
  
   GO.geneID <- dat.expression %>%
     ungroup() %>%
     dplyr::filter(species == x) %>%
    dplyr::filter(stage == "FLF") %>%
    dplyr::slice_max(order_by = expression, prop = percentile) 
  
  organism = case_when (x == "Ss" ~ "ststerprjeb528",
                          x == "Sr" ~ "strattprjeb125",
                          x == "Sp" ~ "stpapiprjeb525",
                          x == "Sv" ~ "stveneprjeb530",
                          x == "Ce" ~ "caelegprjna13758")
   
  gost.res <- gost(GO.geneID$geneID,
                     organism = organism, 
                     evcodes = TRUE,
                     correction_method = "fdr")
  
  })
names(gost.results.highexp) <- species_names
    
enriched.terms.highexp <- find_enrichments(gost.results.highexp)
highTerms.overlap.highexp <- enriched.terms.highexp %>%
  bind_rows(.id = "species") %>%
  dplyr::filter(species != "Ce") %>%
  find_common_enrichments(.,0.001, 3)

What GO terms are commonly enriched in highly expressed genes in 3 of 4 species?

dplyr::filter(gost.results.highexp$Ss$result, term_id %in% highTerms.overlap.highexp$term_id) %>%
  dplyr::select(term_id, term_name, intersection)

How many GO terms are enriched in both highly expressed sequences and highest Str-codon-adapted sequences?

#Overlap btwn GO terms enriched in highly expressed sequences and highest Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.top, 
                      term_id %in% highTerms.overlap.highexp$term_id)
dplyr::filter(gost.results.highexp$Ss$result, term_id %in% temp$term_id) %>%
  dplyr::select(term_id, term_name)

How many GO terms are enriched in both highly expressed sequences and poorest Str-codon-adapted sequences?

#Overlap btwn GO terms enriched in highly expressed sequences and poorest Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.bot, 
                      term_id %in% highTerms.overlap.highexp$term_id)
dplyr::filter(gost.results.highexp$Ss$result, term_id %in% temp$term_id) %>%
  dplyr::select(term_id, term_name)

GO Analysis: Bottom % Gene Expression

Generate lists of genes with lowest expression in free-living females in each species, and run GO analysis to see if these genes are enriched for anything interesting. Here, common GO terms are defined as terms that appear in 3 or 4 species, because the results from S. papillosus are divergent from the others.

gost.results.lowexp <- lapply(species_names, function(x) {
  
   GO.geneID <- dat.expression %>%
     ungroup() %>%
     dplyr::filter(species == x) %>%
    dplyr::filter(stage == "FLF") %>%
    dplyr::slice_max(order_by = desc(expression), prop = percentile) 
  
  organism = case_when (x == "Ss" ~ "ststerprjeb528",
                          x == "Sr" ~ "strattprjeb125",
                          x == "Sp" ~ "stpapiprjeb525",
                          x == "Sv" ~ "stveneprjeb530",
                          x == "Ce" ~ "caelegprjna13758")
   
  gost.res <- gost(GO.geneID$geneID,
                     organism = organism, 
                     evcodes = TRUE,
                     correction_method = "fdr")
  
  })
names(gost.results.lowexp) <- species_names
    
enriched.terms.lowexp <- find_enrichments(gost.results.lowexp)
highTerms.overlap.lowexp <- enriched.terms.lowexp %>%
  bind_rows(.id = "species") %>%
  dplyr::filter(species != "Ce") %>%
  find_common_enrichments(.,0.001, 3)

What GO terms are commonly enriched in poorly expressed genes in 3 of 4 species?

dplyr::filter(gost.results.lowexp$Ss$result, term_id %in% highTerms.overlap.lowexp$term_id) %>%
  dplyr::select(term_id, term_name, intersection)

How many GO terms are enriched in both poorly expressed sequences and highest Str-codon-adapted sequences?

#Overlap btwn GO terms enriched in poorly expressed sequences and highest Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.top, 
                      term_id %in% highTerms.overlap.lowexp$term_id)
dplyr::filter(gost.results.lowexp$Ss$result, term_id %in% temp$term_id) %>%
  dplyr::select(term_id, term_name)

How many GO terms are enriched in both poorly expressed sequences and poorest Str-codon-adapted sequences?

#Overlap btwn GO terms enriched in poorly expressed sequences and poorest Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.bot, 
                      term_id %in% highTerms.overlap.lowexp$term_id)
dplyr::filter(gost.results.lowexp$Ss$result, term_id %in% temp$term_id) %>%
  dplyr::select(term_id, term_name)

Save Files and Plots

Saving data and plots generated by the above code.

# Save distribution plot of codon adaptivness by species
ggsave('../Outputs/Codon_Adaptiveness_Distributions_by_Species.pdf', 
       plot = species_adaptiveness_plot, 
       width = 5.5, height = 4, 
       units = "in", device = "pdf", 
       useDingbats = FALSE)

# Save distribution plot of GC content by species
ggsave('../Outputs/GC_Distributions_by_Species.pdf', 
       plot = species_GC_plot, 
       width = 5.5, height = 4, 
       units = "in", device = "pdf", 
       useDingbats = FALSE)

# Save plot of CAI vs FLF expression level
ggsave('../Outputs/CAIvsFLFlog2CPM_by_Species.pdf', 
       plot = caiExp.highExp_plot, 
       width = 5.5, height = 3, 
       units = "in", device = "pdf", 
       useDingbats = FALSE)

write_excel <- function(data, sheet_facet, filename){
  
  file <- paste(filename,".xlsx",sep = "")
  # Workbook
  to_download <<- createWorkbook()
  
  sapply(seq_along(sheet_facet), function(x){
    addWorksheet(wb = to_download, sheetName = sheet_facet[[x]])
    
    sheet_data <- data %>%
      dplyr::filter(species == sheet_facet[[x]]) %>%
      dplyr::select(!species)
    # # Write Data
    # ## Sheet header
    # writeData(
    #   to_download,
    #   sheet = 1,
    #   x = c(
    #     paste0("Strongyloides Codon Usage Report"),
    #     paste0("Report generated on ", format(Sys.Date(), "%B %d, %Y"))
    #   )
    # )
    # 
    ## Results of codon usage analysis
    writeData(
      to_download,
      sheet = x,
      x = sheet_data,
      startRow = 1,
      startCol = 1,
      headerStyle = createStyle(
        textDecoration = "Bold",
        halign = "center",
        border = "bottom"
      )
    )
    
  })
  saveWorkbook(to_download, file, overwrite=TRUE)
}  

# Save files with top and bottom percentiles of codon adapted genes per species
tbl %>%
      dplyr::select(geneID, species_CAI, GC, Description, species) %>%
  write_excel(levels(tbl$species), '../Outputs/Percentile_CAI_Genes')

# Save list of all GO terms for each species, and the genes id that intersect
# between the GO term and the query
# for the top and bottom percentile of codon adapted genes
enriched.GO.terms <- rbind(
  lowCAI = lapply(names(gost.results.bot), function(y){
  gost.results.bot[[y]]$result %>%
    as_tibble() %>%
    dplyr::select(-c(query, significant,parents)) %>%
    dplyr::arrange(p_value) %>%
    dplyr::mutate(species = y)
}) %>% bind_rows %>%
  dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce"))) %>%
  dplyr::mutate(Description = "Least-adapted genes")  %>%
  dplyr::relocate(Description,species,term_id, source, term_name, .before = everything()),
highCAI = lapply(names(gost.results.top), function(y){
  gost.results.top[[y]]$result %>%
    as_tibble() %>%
    dplyr::select(-c(query, significant,parents)) %>%
    dplyr::arrange(p_value) %>%
    dplyr::mutate(species = y)
}) %>% bind_rows %>%
  dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce"))) %>%
  dplyr::mutate(Description = "Most-adapted genes")  %>%
  dplyr::relocate(Description,species,term_id, source, term_name, .before = everything()) 
)

write_excel(enriched.GO.terms, levels(enriched.GO.terms$species), '../Outputs/Percentile_GO_Analysis')

# Save list of common GO terms for each species, and the genes id that intersect
# between the GO term and the query
# for the top and bottom percentile of codon adapted genes and GT-rich and AT-rich subsets too
common.terms <- rbind (lowCAI = filter(enriched.GO.terms, term_id %in%  highTerms.overlap.bot$term_id) %>%
         dplyr::filter(species == "Ss") %>%
         dplyr::select(term_id, source,term_name, Description),
       highCAI = filter(enriched.GO.terms, term_id %in%  highTerms.overlap.top$term_id) %>%
         dplyr::filter(species == "Ss") %>%
         dplyr::select(term_id, source, term_name, Description),
       ATrich =  dplyr::filter(gost.results.ATrich$Ss$result, term_id %in% highTerms.overlap.ATrich$term_id) %>%
         dplyr::select(term_id, source, term_name) %>%
         dplyr::mutate(Description = "AT-rich genes"),
       GCrich = dplyr::filter(gost.results.GCrich$Ss$result, term_id %in% highTerms.overlap.GCrich$term_id) %>%
         dplyr::select(term_id,  source, term_name)%>%
         dplyr::mutate(Description = "GC-rich genes")
) %>%
  dplyr::mutate(Description = factor(Description)) %>%
  dplyr::rename(species = Description)

  write_excel(common.terms, levels(common.terms$species), '../Outputs/Commonly_Enriched_GO_Terms')

# Save Functional Enrichment Plots for bottom percentile
lapply(names(gost.results.bot), function(y){
  gost.res <- gost.results.bot[[y]]
  
  gost.ggplot <- gostplot(gost.res, interactive = F, capped = T) +
    labs(title = paste(y, "Bottom", percentile*100,"% Codon Adapted Genes")) +
    theme(plot.title.position = "plot",
          plot.caption.position = "plot",
          plot.title = element_text(face = "bold",
                                    size = 8, hjust = 0))
  gost.ggplot.pub<- publish_gostplot(gost.ggplot,
                                     highlight_terms = highTerms.overlap.bot$term_id)
  ggsave(paste0('../Outputs/',y,'_Bot_CAI_Plot.pdf'), plot = gost.ggplot.pub,
         width = 7, height = 4,
         units = "in", device = "pdf",
         useDingbats = FALSE)
  
})

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
# Save Functional Enrichment Plots for top percentile
lapply(names(gost.results.top), function(y){
  gost.res <- gost.results.top[[y]]
  
  gost.ggplot <- gostplot(gost.res, interactive = F, capped = T) +
    labs(title = paste(y, "Top", percentile*100,"% Codon Adapted Genes")) +
    theme(plot.title.position = "plot",
          plot.caption.position = "plot",
          plot.title = element_text(face = "bold",
                                    size = 8, hjust = 0))
  gost.ggplot.pub<- publish_gostplot(gost.ggplot,
                                     highlight_terms = highTerms.overlap.top$term_id)
  ggsave(paste0('../Outputs/',y,'_Top_CAI_Plot.pdf'), plot = gost.ggplot.pub,
         width = 7, height = 5,
         units = "in", device = "pdf",
         useDingbats = FALSE)
  
})

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL

Appendix: All code for this report

suppressPackageStartupMessages({
  library(knitr)
  library(rmarkdown)
  library(tidyverse)
  library(readxl)
  library(magrittr)
  library(DescTools)
  library(ggthemes)
  library(biomaRt)
  library(ggplot2)
  library(openxlsx)
  library(wrapr)
  library(Matching)
  library(cowplot)
  library(gprofiler2)
  source('fetch_cDNAseq.R')
})
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)

# Check for presence of output plots folder, generate if it doesn't exist
output.path <- "../Outputs"
if (!dir.exists(output.path)){
  dir.create(output.path)
}

if (!file.exists(c("../Data/tidy_genomic_CAI.csv"))) {
  
  
  temp <- c(Ss = '../Data/Ss_Codon_Usage_Report.xlsx',
            Sr = '../Data/Sr_Codon_Usage_Report.xlsx',
            Sp = '../Data/Sp_Codon_Usage_Report.xlsx',
            Sv = '../Data/Sv_Codon_Usage_Report.xlsx',
            Bm = '../Data/Bm_Codon_Usage_Report.xlsx',
            Ce = '../Data/Ce_Codon_Usage_Report.xlsx')
  dat.genomic <- sapply(temp, read_excel, 
                        sheet = 1, 
                        skip = 3, 
                        col_names = T, 
                        simplify = F, 
                        USE.NAMES = T)
  # This data contains transcript level information. 
  # Commented code would trim rows such that in cases where there are 
  # multiple transcripts per gene, only  
  # values from the longest transcript would be included.
  # This was originally included to permit comparison to gene expression databases
  # but is no longer required.
  dat.genomic.cleaned <- lapply(dat.genomic, function(x){
    x %>%
      dplyr::mutate(transcriptID = geneID)
    # dplyr::rename(transcriptID = geneID) %>%
    # dplyr::mutate(geneID = str_remove_all(transcriptID, "\\.[0-9]$")) %>%
    # dplyr::mutate(geneID = str_remove_all(geneID, "[a-c]$")) %>%
    # dplyr::group_by(geneID) %>%
    # dplyr::slice_max(n = 1, order_by = str_length(`cDNA sequence`), 
    #                   with_ties = FALSE) %>%
    #dplyr::relocate(geneID, .before = everything())
    
  })
  names(dat.genomic.cleaned) <- c("Ss", "Sr", "Sp", "Sv", "Bm", "Ce")
  
}
if (!file.exists(c("../Data/tidy_genomic_CAI.csv"))) {
  dat.genomic.df <- dat.genomic.cleaned %>%
    map("geneID") %>%
    unlist() %>%
    as_tibble_col(column_name = "geneID") %>%
    group_by(geneID)
  
  dat.genomic.df <- dat.genomic.cleaned %>%
    map("transcriptID") %>%
    unlist() %>%
    as_tibble_col(column_name = "transcriptID") %>%
    add_column(dat.genomic.df, .)
  
  dat.genomic.df <- dat.genomic.cleaned %>%
    map("Sr_CAI") %>%
    unlist() %>%
    as_tibble_col(column_name = "Sr_CAI") %>%
    add_column(dat.genomic.df, .)
  
   dat.genomic.df <- dat.genomic.cleaned %>%
    map("Bm_CAI") %>%
    unlist() %>%
    as_tibble_col(column_name = "Bm_CAI") %>%
    add_column(dat.genomic.df, .)
  
  dat.genomic.df <- dat.genomic.cleaned %>%
    map("Ce_CAI") %>%
    unlist() %>%
    as_tibble_col(column_name = "Ce_CAI") %>%
    add_column(dat.genomic.df, .)
  
  dat.genomic.df <- dat.genomic.cleaned %>%
    map("GC") %>%
    unlist() %>%
    as_tibble_col(column_name = "GC") %>%
    add_column(dat.genomic.df, .)
  
  # Add the column that species which species the gene is from
  # by matching to the original lists of genes, 
  # the group
  dat.genomic.df <- dat.genomic.df %>%
    dplyr::mutate(species = case_when(
      transcriptID %in% dat.genomic$Ss$geneID ~ 'Ss',
      transcriptID %in% dat.genomic$Sr$geneID ~ 'Sr',
      transcriptID %in% dat.genomic$Sp$geneID ~ 'Sp',
      transcriptID %in% dat.genomic$Sv$geneID ~ 'Sv',
      transcriptID %in% dat.genomic$Bm$geneID ~ 'Bm',
      transcriptID %in% dat.genomic$Ce$geneID ~ 'Ce',
    ))
  dat.genomic.df <- group_by(dat.genomic.df, species)
  
}

# Save a tidy version of the complied genomic CAI value dataset if it doesn't exist
if (!file.exists(c("../Data/tidy_genomic_CAI.csv"))) {
  write.table(dat.genomic.df, 
              file = "../Data/tidy_genomic_CAI.csv", 
              sep = ",", 
              col.names = T, 
              row.names = FALSE)
  
}
dat.genomic.df <- suppressWarnings(
  read.csv("../Data/tidy_genomic_CAI.csv", 
           header = TRUE)) %>%
  as_tibble() 

species.specific.dat.df <- dat.genomic.df %>%
  dplyr::mutate(species_CAI = case_when(
    species == 'Ce' ~ Ce_CAI,
    species == 'Bm' ~ Bm_CAI,
    TRUE ~ Sr_CAI
  )) %>%
  dplyr::mutate(outgroup_CAI = case_when(
    species == 'Ce' ~ Sr_CAI,
    species != 'Ce' ~ Ce_CAI
  )) %>%
  #dplyr:: select(geneID, species_CAI, species) %>%
  group_by(species)


plot_distributions <- function(dat,
                               x,
                               title,
                               xlabel,
                               caption = "",
                               ylabel = "Density (scaled to maximum of 1)",
                               xlim = c(0,1),
                               ylim = c(0,1)) {
  dat %>%
    ggplot(aes(x = x, y = after_stat(ndensity), color = species)) +
    geom_freqpoly(bins = 50) +
    scale_color_manual(values = c("seagreen4", "grey4","steelblue4", 
                                  "coral4", "darkgoldenrod4", 
                                  "darkorchid4"),
                       name = "Species",
                       labels = c("B. malayi","C. elegans", "S. papillosus", 
                                  "S. ratti", "S. stercoralis",
                                  "S. venezuelensis")) +
    labs(title = title,
         subtitle = "Data includes all predicted coding sequences",
         x = xlabel,
         y = ylabel,
         caption = caption) +
    coord_equal(xlim = xlim, ylim = ylim) +
    theme_bw() +
    theme(plot.title.position = "plot",
          plot.caption.position = "panel",
          plot.title = element_text(face = "bold",
                                    size = 10, hjust = 0),
          plot.subtitle = element_text(size = 8),
          legend.title = element_text(size = 8),
          axis.title = element_text(size = 8),
          axis.text = element_text(size = 8),
          axis.text.x = element_text(angle = 45, hjust = 1))
}
species_adaptiveness_plot <- plot_distributions(species.specific.dat.df,
                                                species.specific.dat.df$species_CAI,
                                                "Species-wide codon adaptiveness",
                                                "Codon bias relative to \n genus coding usage rules (CAI)",
                                                "Strongyloides species values relative to S. ratti usage rules
       C. elegans values relative to C. elegans usage rules.
       B. malayi values relative to B. malayi ussage rules.")

species_adaptiveness_plot
# Kruskal-Wallis testing of genomic CAI
one.way <- kruskal.test(species_CAI ~ species, data = species.specific.dat.df)

# Posthoc Dunn tests with Bonferroni adjustment
post.hoc <- dunn.test::dunn.test(species.specific.dat.df$species_CAI, species.specific.dat.df$species, method = "bonferroni")

species.specific.GC.df <- dat.genomic.df %>%
  dplyr:: select(geneID, GC, species) %>%
  group_by(species)



species_GC_plot <- plot_distributions(species.specific.GC.df,
                                      species.specific.GC.df$GC,
                                      "Species-wide GC content",
                                      "GC content")

species_GC_plot
# Kruskal-Wallis testing of genomic GC
one.way <- kruskal.test(GC ~ species, data = species.specific.GC.df)

# Posthoc Dunn tests with Bonferroni adjustment
post.hoc <- dunn.test::dunn.test(species.specific.GC.df$GC, species.specific.GC.df$species, method = "bonferroni")

percentile = 0.02

# For following analyses, only look at Strongyloides and C. elegans, not B. malayi
species.specific.dat.df<-species.specific.dat.df %>%
  dplyr::filter(species != "Bm")

dat.Top <- species.specific.dat.df %>%
  #dplyr::filter(species != "Ce") %>%
  dplyr::mutate(Description = factor("Top")) %>%
  dplyr::group_by(Description, species) %>%
  dplyr::slice_max(order_by = species_CAI, prop = percentile) %>%
  dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce")))

tally(dat.Top)

dat.Bot <- species.specific.dat.df %>%
  # dplyr::filter(species != "Ce") %>%
  dplyr::mutate(Description = factor("Bot")) %>%
  dplyr::group_by(Description, species) %>%
  dplyr::slice_max(order_by = desc(species_CAI), prop = percentile) %>%
  dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce")))

tally(dat.Bot)
tbl <- rbind(dat.Top, dat.Bot) %>%
  mutate(Description = recode(Description, Top = "Most-adapted genes",
                              Bot = "Least-adapted genes"))

ggplot(tbl, aes(species_CAI,GC, Description)) +
  geom_point(data = species.specific.dat.df,
             mapping = aes(x = species_CAI, y = GC),color = "grey", shape = 1) +
  geom_point(mapping = aes(color = species, shape = Description)) +
  #geom_jitter(mapping = aes(color = species), shape = 1) +
  coord_fixed (3) +
  facet_grid( ~species) +
  scale_color_hue (l = 40) +
   scale_shape_manual(values = c(1:2)) +
  labs(x = "Codon bias relative to genus usage rules",
       y = "G+C ratio",
       title = "G+C ratio as a function of codon adaptiveness (genus rules)",
       subtitle = "grey icons = all genes \n colored icons = most and least adapted genes ") +
  theme_bw() +
  theme(plot.title.position = "plot",
        plot.caption.position = "panel",
        plot.title = element_text(face = "bold",
                                  size = 8, hjust = 0),
        plot.subtitle = element_text(size = 8),
        legend.title = element_text(size = 8),
        axis.title = element_text(size = 8),
        axis.text = element_text(size = 8),
        axis.text.x = element_text(angle = 45, hjust = 1))



# tbl <- pivot_longer(tbl,
#              cols = c("species_CAI", "outgroup_CAI"),
#              names_to = "value",
#              values_to = "CAI"
#              )
ggplot(tbl, aes(outgroup_CAI,GC, Description)) +
  geom_point(mapping = aes(color = species, shape = Description)) +
  #geom_jitter(mapping = aes(color = species), shape = 1) +
  coord_equal()+
  facet_wrap( ~species) +
  scale_color_hue (l = 40) +
  scale_shape_manual(values = c(1:2)) +
  labs(x = "Codon bias relative to non-genus usage rules",
       y = "G+C ratio",
       title = "G+C ratio as a function of codon adaptiveness (non-genus rules)") +
  theme_bw() +
  theme(plot.title.position = "plot",
        plot.caption.position = "panel",
        plot.title = element_text(face = "bold",
                                  size = 8, hjust = 0),
        plot.subtitle = element_text(size = 8),
        legend.title = element_text(size = 8),
        axis.title = element_text(size = 8),
        axis.text = element_text(size = 8))


temp <- c(Ss = '../Data/SsRNAseq_log2cpm_filtered_norm_voom.csv',
          Sr = '../Data/SrRNAseq_log2cpm_filtered_norm_voom.csv',
          Sp = '../Data/SpRNAseq_log2cpm_filtered_norm_voom.csv',
          Sv = '../Data/SvRNAseq_log2cpm_filtered_norm_voom.csv')
dat.expression <- sapply(temp, read.csv, 
                      header = TRUE)
species_names <- names(dat.expression)

dat.Top.expression <- lapply(species_names, function(x) {
    dat.Top.select <- dplyr::filter(dat.Top, species == x) %>%
        dplyr::mutate(geneID = str_remove_all(geneID, "\\.[0-9]$")) 
    
    subset.expression <- dat.expression[[x]] %>%
        dplyr::filter(X %in% dat.Top.select$geneID) %>%
        dplyr::rename(geneID = X) %>%
        pivot_longer(cols = -geneID,
                     names_to = "samples",
                     values_to = "expression") %>%
    tidyr::separate(samples,c("stage", NA)) %>%
    dplyr::group_by(geneID, stage) %>%
    dplyr::summarize(expression = median(expression))
})
names(dat.Top.expression) <- species_names
dat.Top.expression <- map_dfr(dat.Top.expression, bind_rows, .id = "species")


dat.Bot.expression <- lapply(species_names, function(x) {
    dat.Bot.select <- dplyr::filter(dat.Bot, species == x) %>%
        dplyr::mutate(geneID = str_remove_all(geneID, "\\.[0-9]$")) 
    
    subset.expression <- dat.expression[[x]] %>%
        dplyr::filter(X %in% dat.Bot.select$geneID) %>%
        dplyr::rename(geneID = X) %>%
        pivot_longer(cols = -geneID,
                     names_to = "samples",
                     values_to = "expression") %>%
    tidyr::separate(samples,c("stage", NA)) %>%
    dplyr::group_by(geneID, stage) %>%
    dplyr::summarize(expression = median(expression))
})
names(dat.Bot.expression) <- species_names
dat.Bot.expression <- map_dfr(dat.Bot.expression, bind_rows, .id = "species")

dat.expression <- lapply(species_names, function(x) {
    dat.expression[[x]] %>%
        dplyr::rename(geneID = X) %>%
        pivot_longer(cols = -geneID,
                     names_to = "samples",
                     values_to = "expression") %>%
    tidyr::separate(samples,c("stage", NA)) %>%
    dplyr::group_by(geneID, stage) %>%
    dplyr::summarize(expression = median(expression))
})

names(dat.expression) <- species_names
dat.expression <- map_dfr(dat.expression, bind_rows, .id = "species")

lapply(species_names, function(x){
  plotdata.subset1 <- filter(dat.Top.expression, species == x)
  plotdata.subset2 <- filter(dat.Bot.expression, species == x)
  plotdata.all <- filter(dat.expression, species == x)
  
  plot.df <- bind_rows(HighCAI = plotdata.subset1,
                       LowCAI = plotdata.subset2,
                       AllGenes = plotdata.all, .id = "Description")
 
    p.Bot.expression <- ggplot(plot.df) +
        aes(x=stage, y=expression, color = Description, fill = Description) +
      geom_violin(show.legend = T, 
                    alpha = .5,
                   position =  position_dodge(width = 1),
                    trim = FALSE) +
      scale_color_manual(values = c("black", "coral4", "steelblue4")) +
      scale_fill_manual(values = c("grey", "coral4", "steelblue4")) +
        stat_summary(fun = "median", 
                     geom = "point", 
                     shape = 20, 
                     size = 2,  
                     position =  position_dodge(width = 1),
                     show.legend = FALSE) +
      facet_grid(~stage, scales = "free") +
        labs(y="log2CPM Expression", x = "Life Stage",
             title = paste0(x, " - log2 Counts per Million (CPM)"),
             caption=paste0("produced on ", Sys.time())) +
        theme_bw() 
    
    suppressMessages(ggsave(paste0(x,"_CAIExpression.pdf"),
                            plot = p.Bot.expression,
                            device = "pdf",
                            height = 4,
                            width = 7,
                            path = output.path,
                            useDingbats=FALSE))
    
    p.Bot.expression
})


temp <- lapply(species_names, function(x) {
    dat.select <- dplyr::filter(species.specific.dat.df, 
                                    species == x) %>%
        dplyr::mutate(geneID = str_remove_all(geneID, "\\.[0-9]$")) %>%
      dplyr::ungroup() %>%
      dplyr::select(geneID, species_CAI)
    
    subset.expression <- dat.expression %>%
      dplyr::filter(species == x)%>%
      dplyr::filter(stage == "FLF") %>%
      dplyr::filter(geneID %in% dat.select$geneID)
    
    full_join(dat.select, subset.expression, by = "geneID") %>%
      dplyr::select(!stage)
})
names(temp) <- species_names

caiExp.df <- map_dfr(temp, bind_rows, .id = "species")

caiExp.df <- bind_rows(AllGenes = caiExp.df,
                       HighCAI =  dplyr::filter(caiExp.df, geneID %in% dat.Top.expression$geneID),
                       LowCAI =  dplyr::filter(caiExp.df, geneID %in% dat.Bot.expression$geneID),
                       .id = "Description") %>%
  dplyr::mutate(species = as_factor(species))

caiExp.highExp_plot <- ggplot(caiExp.df) +
   aes(x=Description, y=expression, color = Description, fill = Description) +
      geom_violin(show.legend = T, 
                   position =  position_dodge(width = 1),
                    trim = FALSE,
                  color = 'black') +
  
      #scale_color_manual(values = c("black", "coral4", "steelblue4")) +
      scale_fill_manual(values = c("grey", "coral4", "steelblue4")) +
        stat_summary(fun = "median", 
                     geom = "point", 
                     shape = 20, 
                     size = 2,  
                     position =  position_dodge(width = 1),
                     color = "black",
                     show.legend = FALSE) +
  facet_grid( ~species) +
  labs(y="log2CPM Expression", x = "Gene subsets (base on CAI value) by species",
             title =  "log2 Counts per Million (CPM) in free-living female as a function of codon bias",
       subtitle = "grey = all genes \ncolors = top/bottom 2% CAI",
             caption=paste0("produced on ", Sys.time())) +
        theme_bw() +
  theme(plot.title.position = "plot",
        plot.caption.position = "panel",
        plot.title = element_text(face = "bold",
                                  size = 8, hjust = 0),
        plot.subtitle = element_text(size = 8),
        legend.title = element_text(size = 8),
        axis.title = element_text(size = 8),
        axis.text = element_text(size = 8),
        axis.text.x = element_text(angle = 45, hjust = 1))

caiExp.highExp_plot

suppressPackageStartupMessages({library(car)})
cpm.merged <- rbind(dat.expression %>% dplyr::mutate(condition = "AllGenes"),
                    dat.Top.expression %>% dplyr::mutate(condition = "HighCPM"),
                    dat.Bot.expression %>% dplyr::mutate(condition = "LowCPM")) %>%
    dplyr::mutate(condition = as_factor(condition))%>%
  group_by(species, geneID, condition, stage)

options(contrasts = c("contr.sum", "contr.poly"))

lapply(species_names, function(x) {
cpm.merged.sub <- filter(cpm.merged, species == x)

res.aov2<- aov(expression ~ condition * stage, data = cpm.merged.sub)
Anova(res.aov2, type = "III")
res.aov3<-TukeyHSD(res.aov2, "condition:stage")
as_tibble(res.aov3$`condition:stage`, rownames = "comparison") %>%
  separate(col = comparison, into = c("comp1", "LS1", "comp2", "LS2")) %>%
  dplyr::filter(LS1 == LS2) %>%
  unite(temp1, comp1, LS1, sep = ":")%>%
  unite(temp2, comp2, LS2, sep = ":")%>%
  unite(comparison, temp1, temp2, sep = "-")

})
# Carry out GO enrichment using gProfiler2 ----

run_GO <- function (data){
  GO.geneID <- data %>%
    ungroup() %>%
    group_by(species) %>%
    dplyr::group_map(~ {
      gost.res <- .x %>%
        dplyr::select(geneID, GC) %>%
        unique() 
    }, .keep = TRUE)
  
  names(GO.geneID) <- levels(data$species)
  
  gost.results <- lapply(names(GO.geneID), function(y){
    organism = case_when (y == "Ss" ~ "ststerprjeb528",
                          y == "Sr" ~ "strattprjeb125",
                          y == "Sp" ~ "stpapiprjeb525",
                          y == "Sv" ~ "stveneprjeb530",
                          y == "Ce" ~ "caelegprjna13758")
    gost.res <- gost(GO.geneID[[y]]$geneID,
                     organism = organism, 
                     evcodes = TRUE,
                     correction_method = "fdr")
  })
  names(gost.results) <- names(GO.geneID)
  gost.results
}

find_enrichments <- function (gost.results) {
  # Find GO terms that are enriched with p-values equal or small than a threshold.
  enriched.terms <- lapply(names(gost.results), function(y){
    gost.results[[y]]$result %>%
      as_tibble() %>%
      dplyr::select(term_id, p_value)
  }) 
  names(enriched.terms) <- names(gost.results)
  enriched.terms
}

find_common_enrichments <- function (data, p.val,n_groups){
  data %>%
    dplyr::filter(p_value <= p.val) %>%
    group_by(term_id) %>%
    summarize(n= n()) %>%
    dplyr::filter(n >= n_groups)
}

gost.results.top <- run_GO(dat.Top)
enriched.terms.top <- find_enrichments(gost.results.top)
highTerms.overlap.top <- enriched.terms.top %>%
  bind_rows(.id = "species") %>%
  dplyr::filter(species != "Ce") %>%
  find_common_enrichments(.,0.001, 4)



gostplot(gost.results.top$Ss, interactive = T, capped = T) 
gostplot(gost.results.top$Sr, interactive = T, capped = T) 
gostplot(gost.results.top$Sp, interactive = T, capped = T) 
gostplot(gost.results.top$Sv, interactive = T, capped = T) 
gostplot(gost.results.top$Ce, interactive = T, capped = T) 
dplyr::filter(gost.results.top$Ss$result, term_id %in% highTerms.overlap.top$term_id) %>%
  dplyr::select(term_id, term_name, intersection)
dplyr::filter(gost.results.top$Ce$result, term_id %in% highTerms.overlap.top$term_id) %>%
  dplyr::select(term_id, term_name)
temp <- dplyr::filter(highTerms.overlap.top, 
                      !term_id %in% gost.results.top$Ce$result$term_id)
dplyr::filter(gost.results.top$Ss$result, term_id %in% temp$term_id) %>%
  dplyr::select(term_id, term_name)
# Carry out GO enrichment using gProfiler2 ----
gost.results.bot <- run_GO(dat.Bot)
enriched.terms.bot <- find_enrichments(gost.results.bot)
highTerms.overlap.bot <- enriched.terms.bot %>%
  bind_rows(.id = "species") %>%
  dplyr::filter(species != "Ce") %>%
  find_common_enrichments(.,0.001, 4)

gostplot(gost.results.bot$Ss, interactive = T, capped = T) 
gostplot(gost.results.bot$Sr, interactive = T, capped = T) 
gostplot(gost.results.bot$Sp, interactive = T, capped = T) 
gostplot(gost.results.bot$Sv, interactive = T, capped = T) 
gostplot(gost.results.bot$Ce, interactive = T, capped = T) 
dplyr::filter(gost.results.bot$Ss$result, term_id %in% highTerms.overlap.bot$term_id) %>%
  dplyr::select(term_id, term_name)
dplyr::filter(gost.results.bot$Ce$result, term_id %in% highTerms.overlap.bot$term_id) %>%
  dplyr::select(term_id, term_name)
temp <- dplyr::filter(highTerms.overlap.bot, 
                      !term_id %in% gost.results.bot$Ce$result$term_id)
dplyr::filter(gost.results.bot$Ss$result, term_id %in% temp$term_id) %>%
  dplyr::select(term_id, term_name)
dat.ATrich <- species.specific.dat.df %>%
  #dplyr::filter(species != "Ce") %>%
  dplyr::mutate(Description = factor("Top")) %>%
  dplyr::group_by(Description, species) %>%
  dplyr::slice_max(order_by = desc(GC), prop = percentile) %>%
  dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce")))
common.sequences <- inner_join(dat.ATrich, dat.Top) %>%
  tally()

tally(dat.Top) %>%
  dplyr::filter(species != "Ce") %>%
  dplyr::mutate(ratio_shared_seq = common.sequences$n/n)

# Go analysis
gost.results.ATrich <- run_GO(dat.ATrich)
enriched.terms.ATrich <- find_enrichments(gost.results.ATrich)
highTerms.overlap.ATrich <- enriched.terms.ATrich %>%
  bind_rows(.id = "species") %>%
  dplyr::filter(species != "Ce") %>%
  find_common_enrichments(.,0.001, 4)

dplyr::filter(gost.results.ATrich$Ss$result, term_id %in% highTerms.overlap.ATrich$term_id) %>%
  dplyr::select(term_id, term_name, intersection)

#
temp <- dplyr::filter(highTerms.overlap.top, 
                      term_id %in% highTerms.overlap.ATrich$term_id)
dplyr::filter(gost.results.ATrich$Ss$result, term_id %in% temp$term_id) %>%
  dplyr::select(term_id, term_name)

temp <- dplyr::filter(highTerms.overlap.bot, 
                      term_id %in% highTerms.overlap.ATrich$term_id)
dplyr::filter(gost.results.ATrich$Ss$result, term_id %in% temp$term_id) %>%
  dplyr::select(term_id, term_name)

dat.GCrich <- species.specific.dat.df %>%
  #dplyr::filter(species != "Ce") %>%
  dplyr::mutate(Description = factor("Bot")) %>%
  dplyr::group_by(Description, species) %>%
  dplyr::slice_max(order_by = GC, prop = percentile) %>%
  dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce")))

common.sequences <- inner_join(dat.GCrich, dat.Bot) %>%
  dplyr::filter(species != "Ce") %>%
  tally()

tally(dat.Bot) %>%
  dplyr::filter(species != "Ce") %>%
  dplyr::mutate(ratio_shared_seq = common.sequences$n/n)

# Go analysis
gost.results.GCrich <- run_GO(dat.GCrich)
enriched.terms.GCrich <- find_enrichments(gost.results.GCrich)
highTerms.overlap.GCrich <- enriched.terms.GCrich %>%
  bind_rows(.id = "species") %>%
  dplyr::filter(species != "Ce") %>%
  find_common_enrichments(.,0.001, 4)

dplyr::filter(gost.results.GCrich$Ss$result, term_id %in% highTerms.overlap.GCrich$term_id) %>%
  dplyr::select(term_id, term_name, intersection)

#Overlap btwn GO terms enriched in GC-rich sequences and least Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.bot, 
                      term_id %in% highTerms.overlap.GCrich$term_id)
dplyr::filter(gost.results.GCrich$Ss$result, term_id %in% temp$term_id) %>%
  dplyr::select(term_id, term_name)

#Overlap btwn GO terms enriched in GC-rich sequences and highest Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.top, 
                      term_id %in% highTerms.overlap.GCrich$term_id)
dplyr::filter(gost.results.GCrich$Ss$result, term_id %in% temp$term_id) %>%
  dplyr::select(term_id, term_name)


gost.results.highexp <- lapply(species_names, function(x) {
  
   GO.geneID <- dat.expression %>%
     ungroup() %>%
     dplyr::filter(species == x) %>%
    dplyr::filter(stage == "FLF") %>%
    dplyr::slice_max(order_by = expression, prop = percentile) 
  
  organism = case_when (x == "Ss" ~ "ststerprjeb528",
                          x == "Sr" ~ "strattprjeb125",
                          x == "Sp" ~ "stpapiprjeb525",
                          x == "Sv" ~ "stveneprjeb530",
                          x == "Ce" ~ "caelegprjna13758")
   
  gost.res <- gost(GO.geneID$geneID,
                     organism = organism, 
                     evcodes = TRUE,
                     correction_method = "fdr")
  
  })
names(gost.results.highexp) <- species_names
    
enriched.terms.highexp <- find_enrichments(gost.results.highexp)
highTerms.overlap.highexp <- enriched.terms.highexp %>%
  bind_rows(.id = "species") %>%
  dplyr::filter(species != "Ce") %>%
  find_common_enrichments(.,0.001, 3)


dplyr::filter(gost.results.highexp$Ss$result, term_id %in% highTerms.overlap.highexp$term_id) %>%
  dplyr::select(term_id, term_name, intersection)

#Overlap btwn GO terms enriched in highly expressed sequences and highest Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.top, 
                      term_id %in% highTerms.overlap.highexp$term_id)
dplyr::filter(gost.results.highexp$Ss$result, term_id %in% temp$term_id) %>%
  dplyr::select(term_id, term_name)

#Overlap btwn GO terms enriched in highly expressed sequences and poorest Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.bot, 
                      term_id %in% highTerms.overlap.highexp$term_id)
dplyr::filter(gost.results.highexp$Ss$result, term_id %in% temp$term_id) %>%
  dplyr::select(term_id, term_name)


gost.results.lowexp <- lapply(species_names, function(x) {
  
   GO.geneID <- dat.expression %>%
     ungroup() %>%
     dplyr::filter(species == x) %>%
    dplyr::filter(stage == "FLF") %>%
    dplyr::slice_max(order_by = desc(expression), prop = percentile) 
  
  organism = case_when (x == "Ss" ~ "ststerprjeb528",
                          x == "Sr" ~ "strattprjeb125",
                          x == "Sp" ~ "stpapiprjeb525",
                          x == "Sv" ~ "stveneprjeb530",
                          x == "Ce" ~ "caelegprjna13758")
   
  gost.res <- gost(GO.geneID$geneID,
                     organism = organism, 
                     evcodes = TRUE,
                     correction_method = "fdr")
  
  })
names(gost.results.lowexp) <- species_names
    
enriched.terms.lowexp <- find_enrichments(gost.results.lowexp)
highTerms.overlap.lowexp <- enriched.terms.lowexp %>%
  bind_rows(.id = "species") %>%
  dplyr::filter(species != "Ce") %>%
  find_common_enrichments(.,0.001, 3)


dplyr::filter(gost.results.lowexp$Ss$result, term_id %in% highTerms.overlap.lowexp$term_id) %>%
  dplyr::select(term_id, term_name, intersection)

#Overlap btwn GO terms enriched in poorly expressed sequences and highest Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.top, 
                      term_id %in% highTerms.overlap.lowexp$term_id)
dplyr::filter(gost.results.lowexp$Ss$result, term_id %in% temp$term_id) %>%
  dplyr::select(term_id, term_name)

#Overlap btwn GO terms enriched in poorly expressed sequences and poorest Str-codon-adapted sequences
temp <- dplyr::filter(highTerms.overlap.bot, 
                      term_id %in% highTerms.overlap.lowexp$term_id)
dplyr::filter(gost.results.lowexp$Ss$result, term_id %in% temp$term_id) %>%
  dplyr::select(term_id, term_name)


# Save distribution plot of codon adaptivness by species
ggsave('../Outputs/Codon_Adaptiveness_Distributions_by_Species.pdf', 
       plot = species_adaptiveness_plot, 
       width = 5.5, height = 4, 
       units = "in", device = "pdf", 
       useDingbats = FALSE)

# Save distribution plot of GC content by species
ggsave('../Outputs/GC_Distributions_by_Species.pdf', 
       plot = species_GC_plot, 
       width = 5.5, height = 4, 
       units = "in", device = "pdf", 
       useDingbats = FALSE)

# Save plot of CAI vs FLF expression level
ggsave('../Outputs/CAIvsFLFlog2CPM_by_Species.pdf', 
       plot = caiExp.highExp_plot, 
       width = 5.5, height = 3, 
       units = "in", device = "pdf", 
       useDingbats = FALSE)

write_excel <- function(data, sheet_facet, filename){
  
  file <- paste(filename,".xlsx",sep = "")
  # Workbook
  to_download <<- createWorkbook()
  
  sapply(seq_along(sheet_facet), function(x){
    addWorksheet(wb = to_download, sheetName = sheet_facet[[x]])
    
    sheet_data <- data %>%
      dplyr::filter(species == sheet_facet[[x]]) %>%
      dplyr::select(!species)
    # # Write Data
    # ## Sheet header
    # writeData(
    #   to_download,
    #   sheet = 1,
    #   x = c(
    #     paste0("Strongyloides Codon Usage Report"),
    #     paste0("Report generated on ", format(Sys.Date(), "%B %d, %Y"))
    #   )
    # )
    # 
    ## Results of codon usage analysis
    writeData(
      to_download,
      sheet = x,
      x = sheet_data,
      startRow = 1,
      startCol = 1,
      headerStyle = createStyle(
        textDecoration = "Bold",
        halign = "center",
        border = "bottom"
      )
    )
    
  })
  saveWorkbook(to_download, file, overwrite=TRUE)
}  

# Save files with top and bottom percentiles of codon adapted genes per species
tbl %>%
      dplyr::select(geneID, species_CAI, GC, Description, species) %>%
  write_excel(levels(tbl$species), '../Outputs/Percentile_CAI_Genes')

# Save list of all GO terms for each species, and the genes id that intersect
# between the GO term and the query
# for the top and bottom percentile of codon adapted genes
enriched.GO.terms <- rbind(
  lowCAI = lapply(names(gost.results.bot), function(y){
  gost.results.bot[[y]]$result %>%
    as_tibble() %>%
    dplyr::select(-c(query, significant,parents)) %>%
    dplyr::arrange(p_value) %>%
    dplyr::mutate(species = y)
}) %>% bind_rows %>%
  dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce"))) %>%
  dplyr::mutate(Description = "Least-adapted genes")  %>%
  dplyr::relocate(Description,species,term_id, source, term_name, .before = everything()),
highCAI = lapply(names(gost.results.top), function(y){
  gost.results.top[[y]]$result %>%
    as_tibble() %>%
    dplyr::select(-c(query, significant,parents)) %>%
    dplyr::arrange(p_value) %>%
    dplyr::mutate(species = y)
}) %>% bind_rows %>%
  dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv","Ce"))) %>%
  dplyr::mutate(Description = "Most-adapted genes")  %>%
  dplyr::relocate(Description,species,term_id, source, term_name, .before = everything()) 
)

write_excel(enriched.GO.terms, levels(enriched.GO.terms$species), '../Outputs/Percentile_GO_Analysis')

# Save list of common GO terms for each species, and the genes id that intersect
# between the GO term and the query
# for the top and bottom percentile of codon adapted genes and GT-rich and AT-rich subsets too
common.terms <- rbind (lowCAI = filter(enriched.GO.terms, term_id %in%  highTerms.overlap.bot$term_id) %>%
         dplyr::filter(species == "Ss") %>%
         dplyr::select(term_id, source,term_name, Description),
       highCAI = filter(enriched.GO.terms, term_id %in%  highTerms.overlap.top$term_id) %>%
         dplyr::filter(species == "Ss") %>%
         dplyr::select(term_id, source, term_name, Description),
       ATrich =  dplyr::filter(gost.results.ATrich$Ss$result, term_id %in% highTerms.overlap.ATrich$term_id) %>%
         dplyr::select(term_id, source, term_name) %>%
         dplyr::mutate(Description = "AT-rich genes"),
       GCrich = dplyr::filter(gost.results.GCrich$Ss$result, term_id %in% highTerms.overlap.GCrich$term_id) %>%
         dplyr::select(term_id,  source, term_name)%>%
         dplyr::mutate(Description = "GC-rich genes")
) %>%
  dplyr::mutate(Description = factor(Description)) %>%
  dplyr::rename(species = Description)

  write_excel(common.terms, levels(common.terms$species), '../Outputs/Commonly_Enriched_GO_Terms')

# Save Functional Enrichment Plots for bottom percentile
lapply(names(gost.results.bot), function(y){
  gost.res <- gost.results.bot[[y]]
  
  gost.ggplot <- gostplot(gost.res, interactive = F, capped = T) +
    labs(title = paste(y, "Bottom", percentile*100,"% Codon Adapted Genes")) +
    theme(plot.title.position = "plot",
          plot.caption.position = "plot",
          plot.title = element_text(face = "bold",
                                    size = 8, hjust = 0))
  gost.ggplot.pub<- publish_gostplot(gost.ggplot,
                                     highlight_terms = highTerms.overlap.bot$term_id)
  ggsave(paste0('../Outputs/',y,'_Bot_CAI_Plot.pdf'), plot = gost.ggplot.pub,
         width = 7, height = 4,
         units = "in", device = "pdf",
         useDingbats = FALSE)
  
})

# Save Functional Enrichment Plots for top percentile
lapply(names(gost.results.top), function(y){
  gost.res <- gost.results.top[[y]]
  
  gost.ggplot <- gostplot(gost.res, interactive = F, capped = T) +
    labs(title = paste(y, "Top", percentile*100,"% Codon Adapted Genes")) +
    theme(plot.title.position = "plot",
          plot.caption.position = "plot",
          plot.title = element_text(face = "bold",
                                    size = 8, hjust = 0))
  gost.ggplot.pub<- publish_gostplot(gost.ggplot,
                                     highlight_terms = highTerms.overlap.top$term_id)
  ggsave(paste0('../Outputs/',y,'_Top_CAI_Plot.pdf'), plot = gost.ggplot.pub,
         width = 7, height = 5,
         units = "in", device = "pdf",
         useDingbats = FALSE)
  
})